# Export this notebook to HTML (Colab %%shell cell magic; /content is Colab's working dir).
%%shell
jupyter nbconvert --to html ///content/DSPM_HW2_ichou.ipynb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Import data.
# header=1: the CSV has two header rows; row index 1 carries the
# machine-readable column names (v001_rawvalue, ...) used below.
df = pd.read_csv('analytic_data2022.csv', header =1)
df
1) Answer:
a. Exploratory data analysis and data preparation:
# Step 1. Clean up data
# Keep only the columns needed for the analysis: county identification plus the
# 'rawvalue' columns of the "Ranked Measures"; everything else is dropped.

# 1a. Keep county information and raw data only: drop every measure column
# (name starts with 'v') that is not a raw value (e.g. numerators, CIs, flags).
non_raw_measures = [col for col in df.columns
                    if col.startswith('v') and 'raw' not in col]
df = df.drop(columns=non_raw_measures)
# 1b. Drop the "Additional Measures": everything from column 'v173_rawvalue' on.
index_additional = df.columns.get_loc('v173_rawvalue')
df = df.drop(columns=df.columns[index_additional:])
# 1c. Drop the 'year' column — all data is from 2022.
df = df.drop(columns='year')
# 1d. Drop state-level rows: they have a missing value in 'county_ranked'.
df = df.dropna(subset=['county_ranked'])
# Show the surviving columns.
for col in df.columns:
    print(col)
# Step 2. Explore the data with summary statistics (one row per column).
df.describe().T
From the dataframe description, we can see outliers (extremely large or small values), missing data, and features on very different scales. We therefore preprocess the data with the following steps:
1.Remove Outlier
2.Process Missing Data
3.Normalization
# Step 3a. Inspect outliers with box plots (the state/county name columns are excluded).
cols = [column for column in df.columns if column not in ('state', 'county')]
for column in cols:
    sns.boxplot(x=df[column])
    # An outlier lies more than 1.5 * IQR beyond the 1st or 3rd quartile.
    first_q = df[column].quantile(0.25)
    third_q = df[column].quantile(0.75)
    iqr = third_q - first_q  # interquartile range
    lowbar = first_q - 1.5 * iqr
    highbar = third_q + 1.5 * iqr
    # Mark both fences on the plot.
    plt.axvline(lowbar, color='red')
    plt.axvline(highbar, color='red')
    plt.show()
# Step 3b. Remove outliers (the id/rank code columns are left untouched).
df_rm = df.copy()
# Identifier columns carry no measure semantics, so their "outliers" are meaningless.
id_columns = {'statecode', 'countycode', 'fipscode', 'county_ranked'}
for col in cols:
    if col in id_columns:
        continue
    # Outlier: more than 1.5 * IQR beyond the 1st or 3rd quartile.
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1  # interquartile range
    lowbar = q1 - 1.5 * iqr
    highbar = q3 + 1.5 * iqr
    # Blank out the outlier cells (they are imputed in Step 4).
    outlier_id = (df.loc[(df[col] < lowbar) | (df[col] > highbar)]).index
    df_rm.loc[outlier_id, col] = np.nan  # np.NaN was removed in NumPy 2.0; use np.nan
    # Plot the cleaned distribution with the fences.
    sns.boxplot(x=df_rm[col])
    plt.axvline(lowbar, color='red')
    plt.axvline(highbar, color='red')
    plt.show()
# Step 4a. Check missing data.
df_rm.isnull().sum()
# Step 4b. Process missing data.
# Strategy:
#   1. Drop the row if the missing value belongs to a "health outcome" measure,
#      since outcomes are what the predictive model is built on.
#   2. For every other column, replace missing values with the column median —
#      the median is robust to the skewed distributions seen in Step 2, where
#      the mean could lead to invalid conclusions.
df_nonull = df_rm.copy()
# Features in the group of "health outcomes".
outcomes = ['v001_rawvalue','v002_rawvalue','v036_rawvalue','v042_rawvalue','v037_rawvalue']
for col in df_nonull.columns:
    if df_rm[col].isnull().sum() != 0:
        if col in outcomes:
            df_nonull.dropna(subset=[col], inplace=True)
        else:
            # Assign the filled column back: fillna(..., inplace=True) on a
            # column selection is deprecated chained assignment in pandas 2.x.
            median = df_nonull[col].median()
            df_nonull[col] = df_nonull[col].fillna(median)
df_nonull.isnull().sum()
df_nonull.describe().transpose()
# Check whether any duplicated rows remain.
df_nonull.duplicated().sum()
# Step 5. Visualization
# 5a. Histograms of every numeric column.
df_nonull.hist(figsize=(20, 20), bins = 15)
plt.show()
# 5b. Correlation heat map.
# numeric_only=True: df_nonull still carries the 'state'/'county' string
# columns, and pandas >= 2.0 raises if corr() is asked to include them.
corr = df_nonull.corr(numeric_only=True)
plt.figure(figsize=(40,40))
sns.heatmap(corr, square=True, annot=True, cmap="RdBu_r")
plt.show()
According to the heat map above, premature death (v001) is strongly correlated (corr > 0.7) with Poor or fair health (v002), Poor physical health days (v036), and Children in poverty (v024).
Some features have a negligible correlation with it (-0.5 < corr < 0.5), such as Adult obesity (v011), Access to exercise opportunities (v132), and Alcohol-impaired driving deaths (v134).
# Step 6. Keep the features whose correlation with premature death (v001)
# has magnitude >= 0.5 (the target itself is retained, with corr = 1).
target_corr = corr['v001_rawvalue']
strong_mask = target_corr.abs() >= 0.5
important_features = target_corr[strong_mask].keys()
df_important = df_nonull[important_features]
print(important_features)
# Step 7. Normalization
from sklearn.preprocessing import MinMaxScaler

# Scale every selected feature into [0, 1] so the k-means distances in part 2
# are not dominated by features with larger raw scales.
scaler = MinMaxScaler(feature_range=(0, 1))
# Select data
X = df_important
# Transform the data
X_scaled = scaler.fit_transform(X)
print(X_scaled.shape)
# Plot one histogram per scaled feature.
fig, axes = plt.subplots(ncols=6, nrows=3, figsize=(30, 15))
for i, feature in enumerate(important_features):
    ax = axes[i//6, i%6]
    # Column i is feature i across all counties — the original plotted
    # X_scaled[i, :] (row i, one county's values), which was a bug.
    ax.hist(X_scaled[:, i], bins=10)
    ax.set_title(f'{feature}')
    ax.set_xlim(0, 1)
plt.show()
2) Answer:
a. Development of clustering model:
# Filter out the features in the groups of "health outcomes" and "health behaviors".
selected_measures = ['v001_rawvalue','v002_rawvalue','v036_rawvalue','v042_rawvalue','v037_rawvalue','v009_rawvalue','v011_rawvalue','v133_rawvalue','v070_rawvalue',
                     'v132_rawvalue','v049_rawvalue','v014_rawvalue']
# Keep only the selected measures that survived the importance filter,
# remembering their column positions inside X_scaled.
feature_list = list(important_features)  # hoisted: built once, not per measure
measure_idx = []
X_c_labels = []
for measure in selected_measures:
    if measure in feature_list:
        # 'idx' rather than 'id' so the builtin id() is not shadowed.
        idx = feature_list.index(measure)
        X_c_labels.append(measure)
        measure_idx.append(idx)
    else:
        print(f'{measure} not in important features!')
# Matrix to be clustered: counties x selected measures.
X_c = X_scaled[:, measure_idx]
print(X_c.shape)
# Run K-Means for k = 1..14 and draw the elbow graph to select the optimal k.
from sklearn.cluster import KMeans

inter = []
r = range(1, 15)
for k in r:
    # n_init=10 pins the pre-1.4 scikit-learn default explicitly
    # (the default changed to 'auto' in 1.4, which alters results).
    kmeansModel = KMeans(n_clusters=k, n_init=10, random_state=46)
    # fit() suffices here — the predicted labels were computed but never used.
    kmeansModel.fit(X_c)
    inter.append(kmeansModel.inertia_)
plt.figure(figsize=(16, 8))
plt.plot(r, inter, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')  # fixed typo: was 'Intertia'
plt.xticks(r)
plt.title('The Elbow Method')
plt.show()
b. Determination of number of clusters:
Based on the elbow graph, I chose k = 3 as the number of clusters.
# Are there any noteworthy groupings of counties?
# According to the elbow and silhouette score, choose k = 3
# According to the elbow (and silhouette score), choose k = 3.
k = 3
kmeansModel = KMeans(n_clusters=k, n_init=10, random_state=46)
clusters_pred = kmeansModel.fit_predict(X_c)
# Work on a copy: df_important is a column slice of df_nonull, and adding a
# column to a slice triggers pandas' SettingWithCopyWarning.
df_important = df_important.copy()
df_important['cluster'] = clusters_pred
# Plot the clustering result for every pair of the first N clustered features.
N = 10
id_figure = 0
ncols = 5
nrows = 9  # N*(N-1)/2 = 45 pair plots = 9 rows x 5 cols
fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5*ncols, 5*nrows))
centers = kmeansModel.cluster_centers_  # hoisted: invariant across the loop
for i in range(0, N):
    for j in range(i+1, N):
        ax = axes[id_figure//ncols, id_figure%ncols]
        ax.scatter(X_c[:, i], X_c[:, j], c=df_important['cluster'], s=20, cmap='viridis')
        # Cluster centers in black.
        ax.scatter(centers[:, i], centers[:, j], c='black', s=100, alpha=0.7)
        ax.set_xlabel(f"1st feature={X_c_labels[i]}")
        ax.set_ylabel(f"2nd feature={X_c_labels[j]}")
        id_figure += 1
plt.show()
Based on the graphs, the clustering of Premature death (v001) versus Poor or fair health (v002) has the clearest boundaries, which indicates the two health outcomes are highly related. Likewise, Poor physical health days (v036) versus Physical inactivity (v070) shows a similar pattern, suggesting that physical inactivity may lead to more poor physical health days.
3) Answer:
a. Development of first supervised learning model predicting premature death:
b. Development of second supervised learning model predicting premature death:
c. List of 5 most important factors influencing premature death, based on two developed models:
# Prepare data
from sklearn.model_selection import train_test_split

# Target: premature death (v001).
y = df_important['v001_rawvalue']
# Features: everything after the five health-outcome columns, minus the
# 'cluster' label added in part 2 — the cluster assignment is derived from the
# outcomes and would leak target information into the predictors.
X = df_important.iloc[:, 5:].drop(columns=['cluster'], errors='ignore')
print(f'target shape: {y.shape}')
print(f'features shape: {X.shape}')
# Split the data into 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
# Model 1: Linear Regression
from sklearn.linear_model import LinearRegression

model_linear = LinearRegression()
model_linear.fit(X_train, y_train)
# 'intercept' — the original name 'inter' collided with the elbow-graph list.
intercept = model_linear.intercept_
coef = model_linear.coef_
print('Intercept : ', intercept)       # fixed label: was 'Interception'
print('Coefficients : ', coef)         # fixed label: was 'Coeficient'
# Stem plot of the coefficients. Tick labels come from the actual training
# columns — the original df_important.keys()[4:] was off by one relative to
# iloc[:, 5:] and also included the 'cluster' column.
plt.stem(range(len(coef)), coef)
plt.xticks(range(len(coef)), X_train.columns, rotation=90)
plt.show()
The five most important factors predicting premature death in the Linear Regression model are Low birthweight (v037_rawvalue), Food environment index (v133_rawvalue), Physical inactivity (v070_rawvalue), Teen births (v014_rawvalue), and Some college (v069_rawvalue).
# Model 2: Gradient Boosting
from sklearn import ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error

params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}
model_gb = ensemble.GradientBoostingRegressor(**params)
model_gb.fit(X_train, y_train)
# Test-set deviance at every boosting iteration.
# mean_squared_error replaces the private attribute model_gb.loss_, which was
# deprecated in scikit-learn 1.0 and removed in 1.1; with loss="squared_error"
# the deviance is the MSE, so the curve is unchanged.
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(model_gb.staged_predict(X_test)):
    test_score[i] = mean_squared_error(y_test, y_pred)
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(
    np.arange(params["n_estimators"]) + 1,
    model_gb.train_score_,
    "b-",
    label="Training Set Deviance",
)
plt.plot(
    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
)
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")
fig.tight_layout()
plt.show()
# Important features of the Gradient Boosting model.
# Reference: scikit-learn "Gradient Boosting regression" example.
feature_importance = model_gb.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
# Labels come from the actual training columns — the original
# df_important.keys()[4:] was off by one and included the 'cluster' column,
# mislabeling every bar.
feature_names = np.array(X_train.columns)
fig = plt.figure(figsize=(12, 6))
# Left: impurity-based (MDI) importance.
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, feature_names[sorted_idx])
plt.title("Feature Importance (MDI)")
# Right: permutation importance on the held-out test set.
result = permutation_importance(
    model_gb, X_test, y_test, n_repeats=10, random_state=0, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=feature_names[sorted_idx],
)
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()
The five most important factors predicting premature death in the Gradient Boosting model are Injury deaths (v135_rawvalue), Some college (v069_rawvalue), Children in poverty (v024_rawvalue), Excessive drinking (v049_rawvalue), and Low birthweight (v037_rawvalue).
4) Answer:
a. Evaluating accuracy of the two supervised learning models:
# Evaluate both models with 5-fold cross-validation on the training data.
from sklearn.model_selection import cross_validate

scoring = "neg_mean_squared_error"
linear_scores = cross_validate(model_linear, X_train, y_train, scoring=scoring, return_estimator=True, cv=5)
gb_scores = cross_validate(model_gb, X_train, y_train, scoring=scoring, return_estimator=True, cv=5)
# Compare mean CV scores (negated MSE, so closer to zero is better):
# Linear Regression vs Gradient Boosting.
mean_linear = linear_scores["test_score"].mean()
mean_gb = gb_scores["test_score"].mean()
print("Linear regression negative mean_squared_error:", mean_linear)
print("GB negative mean_squared_error:", mean_gb)
print("Difference:", mean_linear - mean_gb)
# Held-out test set: R^2 score of each model...
score_linear = model_linear.score(X_test, y_test)
score_gb = model_gb.score(X_test, y_test)
print('Linear Model Score: ', score_linear)
print('GB Model Score: ', score_gb)
# ...and mean squared error (MSE) of each model.
mse = mean_squared_error(y_test, model_linear.predict(X_test))
print("The mean squared error (MSE) on Linear Model test set: {:.4f}".format(mse))
mse = mean_squared_error(y_test, model_gb.predict(X_test))
print("The mean squared error (MSE) on GB Model test set: {:.4f}".format(mse))
The Gradient Boosting model is more accurate: it has a lower mean squared error and a higher R² score on the held-out test data.
b. Recommendations for reducing premature death in Allegheny County:
Based on the feature-importance analysis of the Gradient Boosting model, Allegheny County could expand ICU facilities and improve care processes to lower the rate of injury deaths (v135). It should also create programs to support children in poverty, reducing premature deaths caused by hunger and lack of resources.